In this R notebook we are going to explore the data analytics and data visualization power of R.
In this example we are going to analyze the heart disease database from UCI machine library.
The dataset contains 76 predictors(features) and 303 observations. Patients with heart disease is binary coded as Presence given as 1 and No Presence as 0. The prerequiste to run in R Markdown is download the CSV data file in your working directory. This can be done by setting the current working directory as folows in R chunk: setwd("C:\\Users\\RajuPC\\Documents\\MyR")
First load the supporting R libraries
setwd("C:\\Users\\RajuPC\\Documents\\MyR") # Setting Woring Directory
library(tidyverse) #A high efficient data viz and manipulation R Library
library(caret) # A collection of Machine Learning Libraries
library(plotly) #A interaction Graphing System
library(ggsci) # A great collection of themes for ggplot
Loading of UCI heart disease data.
#Load the CSV data file
hci<-read_csv("heart.csv")
## Parsed with column specification:
## cols(
## age = col_integer(),
## sex = col_integer(),
## cp = col_integer(),
## trestbps = col_integer(),
## chol = col_integer(),
## fbs = col_integer(),
## restecg = col_integer(),
## thalach = col_integer(),
## exang = col_integer(),
## oldpeak = col_double(),
## slope = col_integer(),
## ca = col_integer(),
## thal = col_integer(),
## target = col_integer()
## )
hci$sex <- as.character(hci$sex)
hci$sex[hci$sex== 1] <- "Male"
hci$sex[hci$sex== 0] <- "Female"
summary(hci)
## age sex cp trestbps
## Min. :29.00 Length:303 Min. :0.000 Min. : 94.0
## 1st Qu.:47.50 Class :character 1st Qu.:0.000 1st Qu.:120.0
## Median :55.00 Mode :character Median :1.000 Median :130.0
## Mean :54.37 Mean :0.967 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:2.000 3rd Qu.:140.0
## Max. :77.00 Max. :3.000 Max. :200.0
## chol fbs restecg thalach
## Min. :126.0 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:133.5
## Median :240.0 Median :0.0000 Median :1.0000 Median :153.0
## Mean :246.3 Mean :0.1485 Mean :0.5281 Mean :149.6
## 3rd Qu.:274.5 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564.0 Max. :1.0000 Max. :2.0000 Max. :202.0
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.00 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.80 Median :1.000 Median :0.0000
## Mean :0.3267 Mean :1.04 Mean :1.399 Mean :0.7294
## 3rd Qu.:1.0000 3rd Qu.:1.60 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.20 Max. :2.000 Max. :4.0000
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.314 Mean :0.5446
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
tbl_df(hci)# A nicer view of the data as a table
## # A tibble: 303 x 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <int> <chr> <int> <int> <int> <int> <int> <int> <int> <dbl>
## 1 63 Male 3 145 233 1 0 150 0 2.3
## 2 37 Male 2 130 250 0 1 187 0 3.5
## 3 41 Fema~ 1 130 204 0 0 172 0 1.4
## 4 56 Male 1 120 236 0 1 178 0 0.8
## 5 57 Fema~ 0 120 354 0 1 163 1 0.6
## 6 57 Male 0 140 192 0 1 148 0 0.4
## 7 56 Fema~ 1 140 294 0 0 153 0 1.3
## 8 44 Male 1 120 263 0 1 173 0 0
## 9 52 Male 2 172 199 1 1 162 0 0.5
## 10 57 Male 2 150 168 0 1 174 0 1.6
## # ... with 293 more rows, and 4 more variables: slope <int>, ca <int>,
## # thal <int>, target <int>
Convert following predictors as factor for plotting
#Convert following predictors as factor for plotting
hci$sex<-as.factor(hci$sex)
hci$cp<-as.factor(hci$cp)
hci$thal<-as.factor(hci$thal)
hci$ca<-as.factor(hci$ca)
Distribution of Male and Female population across Age parameter
ggplotly(p1<-hci %>% ggplot(aes(x=age,fill=sex))+geom_bar()+xlab("Age") +
ylab("Number")+ guides(fill = guide_legend(title = "Gender"))
)%>% layout(legend = list(orientation = "h", x = 0, y = 1))
Representation of Cholestoral level
p2<-hci %>% ggplot(aes(x=age,y=chol,fill=sex, size=chol))+geom_point(alpha=0.7)+xlab("Age") +
ylab("Cholestoral")+ scale_fill_npg()+guides(fill = guide_legend(title = "Gender"))+
theme(plot.margin = margin(0.1,.1,.1,.1, "cm"))
ggplotly(p2)%>% layout(legend = list(orientation = "h", x = 0, y = 1))
Representation of Cholestoral level across different defect conditions
p3<-hci %>% ggplot(aes(x=age,y=chol,fill=sex, size=chol))+geom_point(alpha=0.7)+xlab("Age") +
ylab("Cholestoral")+facet_grid(.~fbs)+
theme(plot.margin = margin(0.1,.1,.1,.1, "cm"))
#ggsave("p3.png",plot=p3,dpi=300) To save the plot
ggplotly(p3)%>%layout(legend = list(orientation = "h", x = 0, y = 1))
Comparison of Blood pressure across pain type (0~3)
p4<-hci%>%ggplot(aes(x=sex,y=trestbps))+geom_boxplot(fill="darkorange")+xlab("Sex")+ylab("BP")+facet_grid(~cp)
ggplotly(p4)
Comparison of Cholestoral across pain type (0~3)
p5<-hci%>%ggplot(aes(x=sex,y=chol))+geom_boxplot(fill="#D55E00")+xlab("Sex")+ylab("Chol")+facet_grid(~cp)
ggplotly(p5)
Relation between Gender, Age, Cholestoral, BP
# Scatterplot
gg <- ggplot(hci, aes(x=age, y=chol, col=sex)) +
geom_point(aes( size=trestbps),shape=1,alpha=0.6) + theme_bw()+
geom_smooth(method="loess", se=F) +theme(plot.margin = margin(0.1,.1,.1,.1, "cm"))
ggplotly(gg)%>%layout(legend = list(orientation = "h", x = 0, y = 1))
First the data is partitioned into training and test datasets
# Create the training and test datasets
set.seed(100)
hci<-read_csv("heart.csv")
## Parsed with column specification:
## cols(
## age = col_integer(),
## sex = col_integer(),
## cp = col_integer(),
## trestbps = col_integer(),
## chol = col_integer(),
## fbs = col_integer(),
## restecg = col_integer(),
## thalach = col_integer(),
## exang = col_integer(),
## oldpeak = col_double(),
## slope = col_integer(),
## ca = col_integer(),
## thal = col_integer(),
## target = col_integer()
## )
# Step 1: Get row numbers for the training data
trainRowNumbers <- createDataPartition(hci$target, p=0.8, list=FALSE)
# Step 2: Create the training dataset
trainData <- hci[trainRowNumbers,]
# Step 3: Create the test dataset
testData <- hci[-trainRowNumbers,]
# Store X and Y for later use.
x = trainData[, 1:13]
trainData$target[trainData$target==1]<-"P"
trainData$target[trainData$target==0]<-"N"
y=trainData$target
testData$target[testData$target==1]<-"P"
testData$target[testData$target==0]<-"N"
yt=testData$target
# # See the structure of the new dataset
Normalization of features
preProcess_range_model <- preProcess(trainData, method='range')
preProcess_range_model1 <- preProcess(testData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)
testData <- predict(preProcess_range_model1, newdata = testData)
# Append the Y variable
trainData$target <- as.factor(y)
testData$target<-as.factor(yt)
#apply(trainData[, 1:13], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
str(trainData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 243 obs. of 14 variables:
## $ age : num 0.25 0.562 0.583 0.562 0.312 ...
## $ sex : num 0 1 1 0 1 1 1 1 0 0 ...
## $ cp : num 0.333 0.333 0 0.333 0.333 ...
## $ trestbps: num 0.34 0.245 0.434 0.434 0.245 ...
## $ chol : num 0.178 0.251 0.151 0.384 0.313 ...
## $ fbs : num 0 0 0 0 0 1 0 0 1 0 ...
## $ restecg : num 0 0.5 0.5 0 0.5 0.5 0.5 0 0 0.5 ...
## $ thalach : num 0.771 0.817 0.588 0.626 0.779 ...
## $ exang : num 0 0 0 0 0 0 0 1 0 0 ...
## $ oldpeak : num 0.2258 0.129 0.0645 0.2097 0 ...
## $ slope : num 1 1 0.5 0.5 1 1 1 0.5 1 0.5 ...
## $ ca : num 0 0 0 0 0 0 0 0 0 0 ...
## $ thal : num 0.667 0.667 0.333 0.667 1 ...
## $ target : Factor w/ 2 levels "N","P": 2 2 2 2 2 2 2 2 2 2 ...
str(testData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 60 obs. of 14 variables:
## $ age : num 0.8235 0.0588 0.6471 0.6471 0.5588 ...
## $ sex : num 1 1 0 1 1 0 1 1 0 1 ...
## $ cp : num 1 0.667 0 0.667 0 ...
## $ trestbps: num 0.593 0.419 0.302 0.651 0.535 ...
## $ chol : num 0.26693 0.33466 0.749 0.00797 0.29084 ...
## $ fbs : num 1 0 0 0 0 0 0 0 1 1 ...
## $ restecg : num 0 1 1 1 1 1 1 1 0 0 ...
## $ thalach : num 0.619 1 0.753 0.866 0.722 ...
## $ exang : num 0 0 1 0 0 0 0 1 0 0 ...
## $ oldpeak : num 0.411 0.625 0.107 0.286 0.214 ...
## $ slope : num 0 0 1 1 1 1 1 1 1 0 ...
## $ ca : num 0 0 0 0 0 0 0 0 0.25 0 ...
## $ thal : num 0 0.5 0.5 0.5 0.5 0.5 0.5 1 0.5 0.5 ...
## $ target : Factor w/ 2 levels "N","P": 2 2 2 2 2 2 2 2 2 2 ...
Detection of Heart disease by Earth ML method present in caret package
#fit control
fitControl <- trainControl(
method = 'cv', # k-fold cross validation
number = 5, # number of folds
savePredictions = 'final', # saves predictions for optimal tuning parameter
classProbs = T, # should class probabilities be returned
summaryFunction=twoClassSummary # results summary function
)
# Step 1: Tune hyper parameters by setting tuneLength
set.seed(100)
model_mars2 = train(target ~ ., data=trainData, method='earth', tuneLength = 5, metric='ROC', trControl = fitControl)
## Loading required package: earth
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
##
## Attaching package: 'TeachingDemos'
## The following object is masked from 'package:plotly':
##
## subplot
model_mars2
## Multivariate Adaptive Regression Spline
##
## 243 samples
## 13 predictor
## 2 classes: 'N', 'P'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 194, 194, 195, 195, 194
## Resampling results across tuning parameters:
##
## nprune ROC Sens Spec
## 2 0.7904281 0.7956710 0.7851852
## 5 0.8950297 0.7766234 0.8444444
## 9 0.9031826 0.7952381 0.8666667
## 13 0.9031826 0.7952381 0.8666667
## 17 0.9031826 0.7952381 0.8666667
##
## Tuning parameter 'degree' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 9 and degree = 1.
# Step 2: Predict on testData and Compute the confusion matrix
predicted2 <- predict(model_mars2, testData)
confusionMatrix(reference = testData$target, data = predicted2, mode='everything')
## Confusion Matrix and Statistics
##
## Reference
## Prediction N P
## N 24 6
## P 6 24
##
## Accuracy : 0.8
## 95% CI : (0.6767, 0.8922)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.592e-06
##
## Kappa : 0.6
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8
## Specificity : 0.8
## Pos Pred Value : 0.8
## Neg Pred Value : 0.8
## Precision : 0.8
## Recall : 0.8
## F1 : 0.8
## Prevalence : 0.5
## Detection Rate : 0.4
## Detection Prevalence : 0.5
## Balanced Accuracy : 0.8
##
## 'Positive' Class : N
##
Comparison of some common ML methods using Models_Compare method
# Train the model using adaboost
model_adaboost = train(target ~ ., data=trainData, method='adaboost', tuneLength=2, trControl = fitControl)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
model1 = train(target ~ ., data=trainData, method='knn', tuneLength=2, trControl = fitControl)#KNN Model
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
model2 = train(target ~ ., data=trainData, method='svmRadial', tuneLength=2, trControl = fitControl)#SVM
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
model2 = train(target ~ ., data=trainData, method='rpart', tuneLength=2, trControl = fitControl)#RandomForest
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
# Compare model performances using resample()
models_compare <- resamples(list(EARTH=model_mars2,ADABOOST=model_adaboost, KNN=model1,SVM=model2, RanF=model2))
# Summary of the models performances
summary(models_compare)
##
## Call:
## summary.resamples(object = models_compare)
##
## Models: EARTH, ADABOOST, KNN, SVM, RanF
## Number of resamples: 5
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## EARTH 0.8668430 0.8989899 0.9048822 0.9031826 0.9175084 0.9276896 0
## ADABOOST 0.8451178 0.8483245 0.8569024 0.8618246 0.8783069 0.8804714 0
## KNN 0.7989418 0.8552189 0.8821549 0.8865961 0.9478114 0.9488536 0
## SVM 0.6693122 0.7154882 0.7710438 0.7625541 0.8227513 0.8341751 0
## RanF 0.6693122 0.7154882 0.7710438 0.7625541 0.8227513 0.8341751 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## EARTH 0.6190476 0.7727273 0.8571429 0.7952381 0.8636364 0.8636364 0
## ADABOOST 0.6818182 0.7142857 0.7272727 0.7316017 0.7619048 0.7727273 0
## KNN 0.6666667 0.7272727 0.8095238 0.7770563 0.8181818 0.8636364 0
## SVM 0.5714286 0.7272727 0.7272727 0.7406926 0.7727273 0.9047619 0
## RanF 0.5714286 0.7272727 0.7272727 0.7406926 0.7727273 0.9047619 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## EARTH 0.8148148 0.8518519 0.8518519 0.8666667 0.8518519 0.9629630 0
## ADABOOST 0.7407407 0.7777778 0.7777778 0.7925926 0.8148148 0.8518519 0
## KNN 0.7407407 0.8148148 0.8888889 0.8666667 0.8888889 1.0000000 0
## SVM 0.7037037 0.7407407 0.8148148 0.8296296 0.9259259 0.9629630 0
## RanF 0.7037037 0.7407407 0.8148148 0.8296296 0.9259259 0.9629630 0
# Draw box plots to compare models
scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(models_compare, scales=scales)